In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.stats import zscore
import seaborn as sns
from sklearn.cluster import AgglomerativeClustering 

Reading the database from excel and seeing at the data

In [7]:
veh = pd.read_csv(r'C:\Users\Akash Barwad\Documents\AIML\documents\Project Data\Project-5 - Unsupervised Learning\vehicle-1.csv')
In [8]:
vehtry = pd.read_csv(r'C:\Users\Akash Barwad\Documents\AIML\documents\Project Data\Project-5 - Unsupervised Learning\vehicle-1.csv')
In [9]:
veh.head(30)
Out[9]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio class
0 95.0 48.0 83.0 178.0 72.0 10.0 162.0 42.0 20.0 159.0 176.0 379.0 184.0 70.0 6.0 16.0 187.0 197.0 van
1 91.0 41.0 84.0 141.0 57.0 9.0 149.0 45.0 19.0 143.0 170.0 330.0 158.0 72.0 9.0 14.0 189.0 199.0 van
2 104.0 50.0 106.0 209.0 66.0 10.0 207.0 32.0 23.0 158.0 223.0 635.0 220.0 73.0 14.0 9.0 188.0 196.0 car
3 93.0 41.0 82.0 159.0 63.0 9.0 144.0 46.0 19.0 143.0 160.0 309.0 127.0 63.0 6.0 10.0 199.0 207.0 van
4 85.0 44.0 70.0 205.0 103.0 52.0 149.0 45.0 19.0 144.0 241.0 325.0 188.0 127.0 9.0 11.0 180.0 183.0 bus
5 107.0 NaN 106.0 172.0 50.0 6.0 255.0 26.0 28.0 169.0 280.0 957.0 264.0 85.0 5.0 9.0 181.0 183.0 bus
6 97.0 43.0 73.0 173.0 65.0 6.0 153.0 42.0 19.0 143.0 176.0 361.0 172.0 66.0 13.0 1.0 200.0 204.0 bus
7 90.0 43.0 66.0 157.0 65.0 9.0 137.0 48.0 18.0 146.0 162.0 281.0 164.0 67.0 3.0 3.0 193.0 202.0 van
8 86.0 34.0 62.0 140.0 61.0 7.0 122.0 54.0 17.0 127.0 141.0 223.0 112.0 64.0 2.0 14.0 200.0 208.0 van
9 93.0 44.0 98.0 NaN 62.0 11.0 183.0 36.0 22.0 146.0 202.0 505.0 152.0 64.0 4.0 14.0 195.0 204.0 car
10 86.0 36.0 70.0 143.0 61.0 9.0 133.0 50.0 18.0 130.0 153.0 266.0 127.0 66.0 2.0 10.0 194.0 202.0 van
11 90.0 34.0 66.0 136.0 55.0 6.0 123.0 54.0 17.0 118.0 148.0 224.0 118.0 65.0 5.0 26.0 196.0 202.0 car
12 88.0 46.0 74.0 171.0 68.0 6.0 152.0 43.0 19.0 148.0 180.0 349.0 192.0 71.0 5.0 11.0 189.0 195.0 bus
13 89.0 42.0 85.0 144.0 58.0 10.0 152.0 44.0 19.0 144.0 173.0 345.0 161.0 72.0 8.0 13.0 187.0 197.0 van
14 94.0 49.0 79.0 203.0 71.0 5.0 174.0 37.0 21.0 154.0 196.0 465.0 206.0 71.0 6.0 2.0 197.0 199.0 bus
15 96.0 55.0 103.0 201.0 65.0 9.0 204.0 32.0 23.0 166.0 227.0 624.0 246.0 74.0 6.0 2.0 186.0 194.0 car
16 89.0 36.0 51.0 109.0 52.0 6.0 118.0 57.0 17.0 129.0 137.0 206.0 125.0 80.0 2.0 14.0 181.0 185.0 van
17 99.0 41.0 77.0 197.0 69.0 6.0 177.0 36.0 21.0 139.0 202.0 485.0 151.0 72.0 4.0 10.0 198.0 199.0 bus
18 104.0 54.0 100.0 186.0 61.0 10.0 216.0 31.0 24.0 173.0 225.0 686.0 220.0 74.0 5.0 11.0 185.0 195.0 car
19 101.0 56.0 100.0 215.0 NaN 10.0 208.0 32.0 24.0 169.0 227.0 651.0 223.0 74.0 6.0 5.0 186.0 193.0 car
20 84.0 47.0 75.0 153.0 64.0 6.0 154.0 43.0 19.0 145.0 175.0 354.0 184.0 75.0 0.0 3.0 185.0 192.0 bus
21 84.0 37.0 53.0 121.0 59.0 5.0 123.0 55.0 17.0 125.0 141.0 221.0 133.0 82.0 7.0 1.0 179.0 183.0 van
22 94.0 43.0 64.0 173.0 69.0 7.0 150.0 43.0 19.0 142.0 169.0 344.0 177.0 68.0 9.0 1.0 199.0 206.0 bus
23 87.0 39.0 70.0 148.0 61.0 7.0 143.0 46.0 18.0 136.0 164.0 307.0 141.0 69.0 1.0 2.0 192.0 199.0 bus
24 99.0 53.0 105.0 219.0 66.0 11.0 204.0 32.0 23.0 165.0 221.0 623.0 224.0 68.0 0.0 6.0 191.0 201.0 car
25 85.0 45.0 80.0 154.0 64.0 9.0 147.0 45.0 19.0 148.0 169.0 324.0 174.0 71.0 1.0 4.0 188.0 199.0 van
26 83.0 36.0 54.0 119.0 57.0 6.0 128.0 53.0 18.0 125.0 143.0 238.0 139.0 82.0 6.0 3.0 179.0 183.0 car
27 107.0 54.0 98.0 203.0 65.0 11.0 218.0 31.0 25.0 167.0 229.0 696.0 216.0 72.0 1.0 28.0 187.0 199.0 car
28 102.0 45.0 85.0 193.0 64.0 6.0 192.0 33.0 22.0 146.0 217.0 570.0 163.0 76.0 6.0 7.0 195.0 193.0 bus
29 80.0 38.0 63.0 129.0 55.0 7.0 146.0 46.0 19.0 130.0 168.0 314.0 158.0 83.0 9.0 20.0 180.0 185.0 car
In [96]:
veh.columns
Out[96]:
Index(['compactness', 'circularity', 'distance_circularity', 'radius_ratio',
       'pr.axis_aspect_ratio', 'max.length_aspect_ratio', 'scatter_ratio',
       'elongatedness', 'pr.axis_rectangularity', 'max.length_rectangularity',
       'scaled_variance', 'scaled_variance.1', 'scaled_radius_of_gyration',
       'scaled_radius_of_gyration.1', 'skewness_about', 'skewness_about.1',
       'skewness_about.2', 'hollows_ratio', 'class'],
      dtype='object')

Understanding the shape and size of the database

In [97]:
veh.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 847 entries, 0 to 846
Data columns (total 19 columns):
compactness                    846 non-null float64
circularity                    841 non-null float64
distance_circularity           842 non-null float64
radius_ratio                   840 non-null float64
pr.axis_aspect_ratio           844 non-null float64
max.length_aspect_ratio        846 non-null float64
scatter_ratio                  845 non-null float64
elongatedness                  845 non-null float64
pr.axis_rectangularity         843 non-null float64
max.length_rectangularity      846 non-null float64
scaled_variance                843 non-null float64
scaled_variance.1              844 non-null float64
scaled_radius_of_gyration      844 non-null float64
scaled_radius_of_gyration.1    842 non-null float64
skewness_about                 840 non-null float64
skewness_about.1               845 non-null float64
skewness_about.2               845 non-null float64
hollows_ratio                  846 non-null float64
class                          846 non-null object
dtypes: float64(18), object(1)
memory usage: 125.8+ KB
In [66]:
for feature in veh.columns: # Loop through all columns in the dataframe
    if veh[feature].dtype == 'object': # Only apply for columns with categorical strings
        veh[feature] = pd.Categorical(veh[feature]).codes # Replace strings with an integer
In [67]:
veh.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 847 entries, 0 to 846
Data columns (total 19 columns):
compactness                    846 non-null float64
circularity                    841 non-null float64
distance_circularity           842 non-null float64
radius_ratio                   840 non-null float64
pr.axis_aspect_ratio           844 non-null float64
max.length_aspect_ratio        846 non-null float64
scatter_ratio                  845 non-null float64
elongatedness                  845 non-null float64
pr.axis_rectangularity         843 non-null float64
max.length_rectangularity      846 non-null float64
scaled_variance                843 non-null float64
scaled_variance.1              844 non-null float64
scaled_radius_of_gyration      844 non-null float64
scaled_radius_of_gyration.1    842 non-null float64
skewness_about                 840 non-null float64
skewness_about.1               845 non-null float64
skewness_about.2               845 non-null float64
hollows_ratio                  846 non-null float64
class                          847 non-null int8
dtypes: float64(18), int8(1)
memory usage: 120.0 KB
In [ ]:
 
In [98]:
veh.isnull().values.any()
Out[98]:
True

From the above data we assume that the there are null values present in the database also, there are object datatypes which we have taken care of in by changinfg them to integer value.

In [99]:
veh.describe().T
Out[99]:
count mean std min 25% 50% 75% max
compactness 846.0 93.678487 8.234474 73.0 87.00 93.0 100.0 119.0
circularity 841.0 44.828775 6.152172 33.0 40.00 44.0 49.0 59.0
distance_circularity 842.0 82.110451 15.778292 40.0 70.00 80.0 98.0 112.0
radius_ratio 840.0 168.888095 33.520198 104.0 141.00 167.0 195.0 333.0
pr.axis_aspect_ratio 844.0 61.678910 7.891463 47.0 57.00 61.0 65.0 138.0
max.length_aspect_ratio 846.0 8.567376 4.601217 2.0 7.00 8.0 10.0 55.0
scatter_ratio 845.0 168.901775 33.214848 112.0 147.00 157.0 198.0 265.0
elongatedness 845.0 40.933728 7.816186 26.0 33.00 43.0 46.0 61.0
pr.axis_rectangularity 843.0 20.582444 2.592933 17.0 19.00 20.0 23.0 29.0
max.length_rectangularity 846.0 147.998818 14.515652 118.0 137.00 146.0 159.0 188.0
scaled_variance 843.0 188.631079 31.411004 130.0 167.00 179.0 217.0 320.0
scaled_variance.1 844.0 439.494076 176.666903 184.0 318.00 363.5 587.0 1018.0
scaled_radius_of_gyration 844.0 174.709716 32.584808 109.0 149.00 173.5 198.0 268.0
scaled_radius_of_gyration.1 842.0 72.447743 7.486190 59.0 67.00 71.5 75.0 135.0
skewness_about 840.0 6.364286 4.920649 0.0 2.00 6.0 9.0 22.0
skewness_about.1 845.0 12.602367 8.936081 0.0 5.00 11.0 19.0 41.0
skewness_about.2 845.0 188.919527 6.155809 176.0 184.00 188.0 193.0 206.0
hollows_ratio 846.0 195.632388 7.438797 181.0 190.25 197.0 201.0 211.0
In [100]:
veh.shape
Out[100]:
(847, 19)
In [101]:
veh.size
Out[101]:
16093
In [14]:
sns.pairplot(veh)
C:\Users\Akash Barwad\Anaconda3\lib\site-packages\numpy\lib\histograms.py:824: RuntimeWarning: invalid value encountered in greater_equal
  keep = (tmp_a >= first_edge)
C:\Users\Akash Barwad\Anaconda3\lib\site-packages\numpy\lib\histograms.py:825: RuntimeWarning: invalid value encountered in less_equal
  keep &= (tmp_a <= last_edge)
Out[14]:
<seaborn.axisgrid.PairGrid at 0x1e01782ed30>
In [72]:
veh.columns
Out[72]:
Index(['compactness', 'circularity', 'distance_circularity', 'radius_ratio',
       'pr.axis_aspect_ratio', 'max.length_aspect_ratio', 'scatter_ratio',
       'elongatedness', 'pr.axis_rectangularity', 'max.length_rectangularity',
       'scaled_variance', 'scaled_variance.1', 'scaled_radius_of_gyration',
       'scaled_radius_of_gyration.1', 'skewness_about', 'skewness_about.1',
       'skewness_about.2', 'hollows_ratio', 'class'],
      dtype='object')

With .describe function we identified that there is presence of outliers. we also checked the same visually with the help of pair plot. now we will see the number of outliers and how they are spread across the database.

In [73]:
sns.set(style="whitegrid")
ax = sns.boxplot(x=veh['radius_ratio'],whis=1.5)
In [74]:
sns.set(style="whitegrid")
ax = sns.boxplot(x=veh['pr.axis_aspect_ratio'],whis=1.5)
In [75]:
sns.set(style="whitegrid")
ax = sns.boxplot(x=veh['max.length_aspect_ratio'],whis=1.5)
In [76]:
sns.set(style="whitegrid")
ax = sns.boxplot(x=veh['scaled_variance'],whis=1.5)
In [20]:
sns.set(style="whitegrid")
ax = sns.boxplot(x=veh['scaled_variance.1'],whis=1.5)
In [102]:
sns.set(style="whitegrid")
ax = sns.boxplot(x=veh['scaled_radius_of_gyration.1'],whis=1.5)
In [103]:
sns.set(style="whitegrid")
ax = sns.boxplot(x=veh['skewness_about'],whis=1.5)

Visually also it is confirmed that the outliers are present in the database.

In [10]:
veh.mean()
Out[10]:
compactness                     93.678487
circularity                     44.828775
distance_circularity            82.110451
radius_ratio                   168.888095
pr.axis_aspect_ratio            61.678910
max.length_aspect_ratio          8.567376
scatter_ratio                  168.901775
elongatedness                   40.933728
pr.axis_rectangularity          20.582444
max.length_rectangularity      147.998818
scaled_variance                188.631079
scaled_variance.1              439.494076
scaled_radius_of_gyration      174.709716
scaled_radius_of_gyration.1     72.447743
skewness_about                   6.364286
skewness_about.1                12.602367
skewness_about.2               188.919527
hollows_ratio                  195.632388
dtype: float64

We will go for the co relation between all the columns also visually with heatmap.

In [81]:
cor = veh[['compactness', 'circularity', 'distance_circularity', 'radius_ratio',
       'pr.axis_aspect_ratio', 'max.length_aspect_ratio', 'scatter_ratio',
       'elongatedness', 'pr.axis_rectangularity', 'max.length_rectangularity',
       'scaled_variance', 'scaled_variance.1', 'scaled_radius_of_gyration',
       'scaled_radius_of_gyration.1', 'skewness_about', 'skewness_about.1',
       'skewness_about.2', 'hollows_ratio', 'class']].corr()

cor
Out[81]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio class
compactness 1.000000 0.689786 0.791707 0.691081 0.091779 0.148249 0.812770 -0.788736 0.814248 0.676143 0.764361 0.818674 0.585845 -0.250603 0.236685 0.157670 0.298528 0.365552 -0.033796
circularity 0.689786 1.000000 0.797180 0.625051 0.154283 0.251407 0.858265 -0.827246 0.856603 0.965729 0.806791 0.850863 0.935950 0.053080 0.144968 -0.011869 -0.106339 0.045652 -0.160546
distance_circularity 0.791707 0.797180 1.000000 0.771748 0.158684 0.264621 0.907949 -0.913020 0.896273 0.775149 0.865710 0.890541 0.706950 -0.227001 0.114665 0.266049 0.146027 0.333648 -0.065209
radius_ratio 0.691081 0.625051 0.771748 1.000000 0.665363 0.450486 0.738480 -0.792946 0.712744 0.571083 0.798294 0.725598 0.541325 -0.181520 0.049112 0.174469 0.382912 0.472339 -0.182921
pr.axis_aspect_ratio 0.091779 0.154283 0.158684 0.665363 1.000000 0.648861 0.103832 -0.183492 0.079566 0.127322 0.273738 0.089750 0.122454 0.152860 -0.058539 -0.032180 0.240201 0.267760 -0.098431
max.length_aspect_ratio 0.148249 0.251407 0.264621 0.450486 0.648861 1.000000 0.165998 -0.180053 0.161603 0.305943 0.319033 0.143745 0.189752 0.295638 0.015446 0.043491 -0.026184 0.143919 0.207619
scatter_ratio 0.812770 0.858265 0.907949 0.738480 0.103832 0.165998 1.000000 -0.973504 0.992078 0.810017 0.951672 0.996328 0.800577 -0.028006 0.074376 0.213512 0.005171 0.118504 -0.288904
elongatedness -0.788736 -0.827246 -0.913020 -0.792946 -0.183492 -0.180053 -0.973504 1.000000 -0.950405 -0.776150 -0.938313 -0.956488 -0.766671 0.103535 -0.052243 -0.186027 -0.114846 -0.216769 0.339348
pr.axis_rectangularity 0.814248 0.856603 0.896273 0.712744 0.079566 0.161603 0.992078 -0.950405 1.000000 0.813135 0.938182 0.992316 0.798522 -0.015711 0.083219 0.215200 -0.019066 0.099481 -0.259102
max.length_rectangularity 0.676143 0.965729 0.775149 0.571083 0.127322 0.305943 0.810017 -0.776150 0.813135 1.000000 0.746657 0.797485 0.866554 0.041283 0.136077 0.001660 -0.104437 0.076770 -0.032399
scaled_variance 0.764361 0.806791 0.865710 0.798294 0.273738 0.319033 0.951672 -0.938313 0.938182 0.746657 1.000000 0.949766 0.781016 0.112452 0.036165 0.196202 0.014434 0.086708 -0.312836
scaled_variance.1 0.818674 0.850863 0.890541 0.725598 0.089750 0.143745 0.996328 -0.956488 0.992316 0.797485 0.949766 1.000000 0.797318 -0.016642 0.077288 0.202398 0.006648 0.103839 -0.288146
scaled_radius_of_gyration 0.585845 0.935950 0.706950 0.541325 0.122454 0.189752 0.800577 -0.766671 0.798522 0.866554 0.781016 0.797318 1.000000 0.192245 0.166785 -0.056067 -0.225882 -0.118597 -0.250967
scaled_radius_of_gyration.1 -0.250603 0.053080 -0.227001 -0.181520 0.152860 0.295638 -0.028006 0.103535 -0.015711 0.041283 0.112452 -0.016642 0.192245 1.000000 -0.088736 -0.126686 -0.752437 -0.804793 -0.213049
skewness_about 0.236685 0.144968 0.114665 0.049112 -0.058539 0.015446 0.074376 -0.052243 0.083219 0.136077 0.036165 0.077288 0.166785 -0.088736 1.000000 -0.035154 0.115728 0.097293 0.119652
skewness_about.1 0.157670 -0.011869 0.266049 0.174469 -0.032180 0.043491 0.213512 -0.186027 0.215200 0.001660 0.196202 0.202398 -0.056067 -0.126686 -0.035154 1.000000 0.077460 0.205115 -0.010674
skewness_about.2 0.298528 -0.106339 0.146027 0.382912 0.240201 -0.026184 0.005171 -0.114846 -0.019066 -0.104437 0.014434 0.006648 -0.225882 -0.752437 0.115728 0.077460 1.000000 0.893869 0.067251
hollows_ratio 0.365552 0.045652 0.333648 0.472339 0.267760 0.143919 0.118504 -0.216769 0.099481 0.076770 0.086708 0.103839 -0.118597 -0.804793 0.097293 0.205115 0.893869 1.000000 0.235874
class -0.033796 -0.160546 -0.065209 -0.182921 -0.098431 0.207619 -0.288904 0.339348 -0.259102 -0.032399 -0.312836 -0.288146 -0.250967 -0.213049 0.119652 -0.010674 0.067251 0.235874 1.000000
In [29]:
sns.heatmap(cor,annot=True,annot_kws={"size": 1.0},cmap='BuPu',vmin=-1,vmax=1)
Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x1e036d70ac8>

I found this code online to check the presence of outliers and see the exact positions where the outliers are present so tried the same and replaced them with mean values.

In [107]:
train = veh

null_columns=train.columns[train.isnull().any()]

train[null_columns].isnull().sum()
Out[107]:
compactness                    1
circularity                    6
distance_circularity           5
radius_ratio                   7
pr.axis_aspect_ratio           3
max.length_aspect_ratio        1
scatter_ratio                  2
elongatedness                  2
pr.axis_rectangularity         4
max.length_rectangularity      1
scaled_variance                4
scaled_variance.1              3
scaled_radius_of_gyration      3
scaled_radius_of_gyration.1    5
skewness_about                 7
skewness_about.1               2
skewness_about.2               2
hollows_ratio                  1
class                          1
dtype: int64
In [108]:
print(train[train.isnull().any(axis=1)][null_columns].head())
    compactness  circularity  distance_circularity  radius_ratio  \
5         107.0          NaN                 106.0         172.0   
9          93.0         44.0                  98.0           NaN   
19        101.0         56.0                 100.0         215.0   
35        100.0         46.0                   NaN         172.0   
66         81.0         43.0                  68.0         125.0   

    pr.axis_aspect_ratio  max.length_aspect_ratio  scatter_ratio  \
5                   50.0                      6.0          255.0   
9                   62.0                     11.0          183.0   
19                   NaN                     10.0          208.0   
35                  67.0                      9.0          157.0   
66                  57.0                      8.0          149.0   

    elongatedness  pr.axis_rectangularity  max.length_rectangularity  \
5            26.0                    28.0                      169.0   
9            36.0                    22.0                      146.0   
19           32.0                    24.0                      169.0   
35           43.0                    20.0                      150.0   
66           46.0                    19.0                      146.0   

    scaled_variance  scaled_variance.1  scaled_radius_of_gyration  \
5             280.0              957.0                      264.0   
9             202.0              505.0                      152.0   
19            227.0              651.0                      223.0   
35            170.0              363.0                      184.0   
66            169.0              323.0                      172.0   

    scaled_radius_of_gyration.1  skewness_about  skewness_about.1  \
5                          85.0             5.0               9.0   
9                          64.0             4.0              14.0   
19                         74.0             6.0               5.0   
35                         67.0            17.0               7.0   
66                          NaN             NaN              18.0   

    skewness_about.2  hollows_ratio class  
5              181.0          183.0   bus  
9              195.0          204.0   car  
19             186.0          193.0   car  
35             192.0          200.0   van  
66             179.0          184.0   bus  
In [111]:
veh1=veh.fillna(veh.mean())
In [112]:
train = veh1

null_columns=train.columns[train.isnull().any()]

train[null_columns].isnull().sum()
Out[112]:
class    1
dtype: int64
In [128]:
print(train[train.isnull().any(axis=1)][null_columns].head())
    class
846   NaN
In [129]:
veh1=veh1.dropna()
In [134]:
veh.shape
Out[134]:
(847, 19)
In [ ]:
 
In [135]:
veh1.head(10)
Out[135]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio class
0 95.0 48.000000 83.0 178.000000 72.0 10.0 162.0 42.0 20.0 159.0 176.0 379.0 184.0 70.0 6.0 16.0 187.0 197.0 van
1 91.0 41.000000 84.0 141.000000 57.0 9.0 149.0 45.0 19.0 143.0 170.0 330.0 158.0 72.0 9.0 14.0 189.0 199.0 van
2 104.0 50.000000 106.0 209.000000 66.0 10.0 207.0 32.0 23.0 158.0 223.0 635.0 220.0 73.0 14.0 9.0 188.0 196.0 car
3 93.0 41.000000 82.0 159.000000 63.0 9.0 144.0 46.0 19.0 143.0 160.0 309.0 127.0 63.0 6.0 10.0 199.0 207.0 van
4 85.0 44.000000 70.0 205.000000 103.0 52.0 149.0 45.0 19.0 144.0 241.0 325.0 188.0 127.0 9.0 11.0 180.0 183.0 bus
5 107.0 44.828775 106.0 172.000000 50.0 6.0 255.0 26.0 28.0 169.0 280.0 957.0 264.0 85.0 5.0 9.0 181.0 183.0 bus
6 97.0 43.000000 73.0 173.000000 65.0 6.0 153.0 42.0 19.0 143.0 176.0 361.0 172.0 66.0 13.0 1.0 200.0 204.0 bus
7 90.0 43.000000 66.0 157.000000 65.0 9.0 137.0 48.0 18.0 146.0 162.0 281.0 164.0 67.0 3.0 3.0 193.0 202.0 van
8 86.0 34.000000 62.0 140.000000 61.0 7.0 122.0 54.0 17.0 127.0 141.0 223.0 112.0 64.0 2.0 14.0 200.0 208.0 van
9 93.0 44.000000 98.0 168.888095 62.0 11.0 183.0 36.0 22.0 146.0 202.0 505.0 152.0 64.0 4.0 14.0 195.0 204.0 car

We found out that there are 1-10 number of outliers present in database so we thought to remove them and same has been done with the help of IQR technique. Kindly confirm if this is a good idea in this case or not.

In [136]:
Q1 = veh.quantile(0.25)
Q3 = veh.quantile(0.75)
IQR = Q3 - Q1
print(IQR)
compactness                     13.00
circularity                      9.00
distance_circularity            28.00
radius_ratio                    54.00
pr.axis_aspect_ratio             8.00
max.length_aspect_ratio          3.00
scatter_ratio                   51.00
elongatedness                   13.00
pr.axis_rectangularity           4.00
max.length_rectangularity       22.00
scaled_variance                 50.00
scaled_variance.1              269.00
scaled_radius_of_gyration       49.00
scaled_radius_of_gyration.1      8.00
skewness_about                   7.00
skewness_about.1                14.00
skewness_about.2                 9.00
hollows_ratio                   10.75
dtype: float64
In [137]:
print(veh1 < (Q1 - 1.5 * IQR)) 
(veh1 > (Q3 + 1.5 * IQR))
     circularity  class  compactness  distance_circularity  elongatedness  \
0          False  False        False                 False          False   
1          False  False        False                 False          False   
2          False  False        False                 False          False   
3          False  False        False                 False          False   
4          False  False        False                 False          False   
5          False  False        False                 False          False   
6          False  False        False                 False          False   
7          False  False        False                 False          False   
8          False  False        False                 False          False   
9          False  False        False                 False          False   
10         False  False        False                 False          False   
11         False  False        False                 False          False   
12         False  False        False                 False          False   
13         False  False        False                 False          False   
14         False  False        False                 False          False   
15         False  False        False                 False          False   
16         False  False        False                 False          False   
17         False  False        False                 False          False   
18         False  False        False                 False          False   
19         False  False        False                 False          False   
20         False  False        False                 False          False   
21         False  False        False                 False          False   
22         False  False        False                 False          False   
23         False  False        False                 False          False   
24         False  False        False                 False          False   
25         False  False        False                 False          False   
26         False  False        False                 False          False   
27         False  False        False                 False          False   
28         False  False        False                 False          False   
29         False  False        False                 False          False   
..           ...    ...          ...                   ...            ...   
816        False  False        False                 False          False   
817        False  False        False                 False          False   
818        False  False        False                 False          False   
819        False  False        False                 False          False   
820        False  False        False                 False          False   
821        False  False        False                 False          False   
822        False  False        False                 False          False   
823        False  False        False                 False          False   
824        False  False        False                 False          False   
825        False  False        False                 False          False   
826        False  False        False                 False          False   
827        False  False        False                 False          False   
828        False  False        False                 False          False   
829        False  False        False                 False          False   
830        False  False        False                 False          False   
831        False  False        False                 False          False   
832        False  False        False                 False          False   
833        False  False        False                 False          False   
834        False  False        False                 False          False   
835        False  False        False                 False          False   
836        False  False        False                 False          False   
837        False  False        False                 False          False   
838        False  False        False                 False          False   
839        False  False        False                 False          False   
840        False  False        False                 False          False   
841        False  False        False                 False          False   
842        False  False        False                 False          False   
843        False  False        False                 False          False   
844        False  False        False                 False          False   
845        False  False        False                 False          False   

     hollows_ratio  max.length_aspect_ratio  max.length_rectangularity  \
0            False                    False                      False   
1            False                    False                      False   
2            False                    False                      False   
3            False                    False                      False   
4            False                    False                      False   
5            False                    False                      False   
6            False                    False                      False   
7            False                    False                      False   
8            False                    False                      False   
9            False                    False                      False   
10           False                    False                      False   
11           False                    False                      False   
12           False                    False                      False   
13           False                    False                      False   
14           False                    False                      False   
15           False                    False                      False   
16           False                    False                      False   
17           False                    False                      False   
18           False                    False                      False   
19           False                    False                      False   
20           False                    False                      False   
21           False                    False                      False   
22           False                    False                      False   
23           False                    False                      False   
24           False                    False                      False   
25           False                    False                      False   
26           False                    False                      False   
27           False                    False                      False   
28           False                    False                      False   
29           False                    False                      False   
..             ...                      ...                        ...   
816          False                    False                      False   
817          False                    False                      False   
818          False                    False                      False   
819          False                    False                      False   
820          False                    False                      False   
821          False                    False                      False   
822          False                    False                      False   
823          False                    False                      False   
824          False                    False                      False   
825          False                    False                      False   
826          False                    False                      False   
827          False                    False                      False   
828          False                    False                      False   
829          False                    False                      False   
830          False                    False                      False   
831          False                    False                      False   
832          False                    False                      False   
833          False                    False                      False   
834          False                    False                      False   
835          False                    False                      False   
836          False                    False                      False   
837          False                    False                      False   
838          False                    False                      False   
839          False                    False                      False   
840          False                    False                      False   
841          False                    False                      False   
842          False                    False                      False   
843          False                    False                      False   
844          False                    False                      False   
845          False                    False                      False   

     pr.axis_aspect_ratio  pr.axis_rectangularity  radius_ratio  \
0                   False                   False         False   
1                   False                   False         False   
2                   False                   False         False   
3                   False                   False         False   
4                   False                   False         False   
5                   False                   False         False   
6                   False                   False         False   
7                   False                   False         False   
8                   False                   False         False   
9                   False                   False         False   
10                  False                   False         False   
11                  False                   False         False   
12                  False                   False         False   
13                  False                   False         False   
14                  False                   False         False   
15                  False                   False         False   
16                  False                   False         False   
17                  False                   False         False   
18                  False                   False         False   
19                  False                   False         False   
20                  False                   False         False   
21                  False                   False         False   
22                  False                   False         False   
23                  False                   False         False   
24                  False                   False         False   
25                  False                   False         False   
26                  False                   False         False   
27                  False                   False         False   
28                  False                   False         False   
29                  False                   False         False   
..                    ...                     ...           ...   
816                 False                   False         False   
817                 False                   False         False   
818                 False                   False         False   
819                 False                   False         False   
820                 False                   False         False   
821                 False                   False         False   
822                 False                   False         False   
823                 False                   False         False   
824                 False                   False         False   
825                 False                   False         False   
826                 False                   False         False   
827                 False                   False         False   
828                 False                   False         False   
829                 False                   False         False   
830                 False                   False         False   
831                 False                   False         False   
832                 False                   False         False   
833                 False                   False         False   
834                 False                   False         False   
835                 False                   False         False   
836                 False                   False         False   
837                 False                   False         False   
838                 False                   False         False   
839                 False                   False         False   
840                 False                   False         False   
841                 False                   False         False   
842                 False                   False         False   
843                 False                   False         False   
844                 False                   False         False   
845                 False                   False         False   

     scaled_radius_of_gyration  scaled_radius_of_gyration.1  scaled_variance  \
0                        False                        False            False   
1                        False                        False            False   
2                        False                        False            False   
3                        False                        False            False   
4                        False                        False            False   
5                        False                        False            False   
6                        False                        False            False   
7                        False                        False            False   
8                        False                        False            False   
9                        False                        False            False   
10                       False                        False            False   
11                       False                        False            False   
12                       False                        False            False   
13                       False                        False            False   
14                       False                        False            False   
15                       False                        False            False   
16                       False                        False            False   
17                       False                        False            False   
18                       False                        False            False   
19                       False                        False            False   
20                       False                        False            False   
21                       False                        False            False   
22                       False                        False            False   
23                       False                        False            False   
24                       False                        False            False   
25                       False                        False            False   
26                       False                        False            False   
27                       False                        False            False   
28                       False                        False            False   
29                       False                        False            False   
..                         ...                          ...              ...   
816                      False                        False            False   
817                      False                        False            False   
818                      False                        False            False   
819                      False                        False            False   
820                      False                        False            False   
821                      False                        False            False   
822                      False                        False            False   
823                      False                        False            False   
824                      False                        False            False   
825                      False                        False            False   
826                      False                        False            False   
827                      False                        False            False   
828                      False                        False            False   
829                      False                        False            False   
830                      False                        False            False   
831                      False                        False            False   
832                      False                        False            False   
833                      False                        False            False   
834                      False                        False            False   
835                      False                        False            False   
836                      False                        False            False   
837                      False                        False            False   
838                      False                        False            False   
839                      False                        False            False   
840                      False                        False            False   
841                      False                        False            False   
842                      False                        False            False   
843                      False                        False            False   
844                      False                        False            False   
845                      False                        False            False   

     scaled_variance.1  scatter_ratio  skewness_about  skewness_about.1  \
0                False          False           False             False   
1                False          False           False             False   
2                False          False           False             False   
3                False          False           False             False   
4                False          False           False             False   
5                False          False           False             False   
6                False          False           False             False   
7                False          False           False             False   
8                False          False           False             False   
9                False          False           False             False   
10               False          False           False             False   
11               False          False           False             False   
12               False          False           False             False   
13               False          False           False             False   
14               False          False           False             False   
15               False          False           False             False   
16               False          False           False             False   
17               False          False           False             False   
18               False          False           False             False   
19               False          False           False             False   
20               False          False           False             False   
21               False          False           False             False   
22               False          False           False             False   
23               False          False           False             False   
24               False          False           False             False   
25               False          False           False             False   
26               False          False           False             False   
27               False          False           False             False   
28               False          False           False             False   
29               False          False           False             False   
..                 ...            ...             ...               ...   
816              False          False           False             False   
817              False          False           False             False   
818              False          False           False             False   
819              False          False           False             False   
820              False          False           False             False   
821              False          False           False             False   
822              False          False           False             False   
823              False          False           False             False   
824              False          False           False             False   
825              False          False           False             False   
826              False          False           False             False   
827              False          False           False             False   
828              False          False           False             False   
829              False          False           False             False   
830              False          False           False             False   
831              False          False           False             False   
832              False          False           False             False   
833              False          False           False             False   
834              False          False           False             False   
835              False          False           False             False   
836              False          False           False             False   
837              False          False           False             False   
838              False          False           False             False   
839              False          False           False             False   
840              False          False           False             False   
841              False          False           False             False   
842              False          False           False             False   
843              False          False           False             False   
844              False          False           False             False   
845              False          False           False             False   

     skewness_about.2  
0               False  
1               False  
2               False  
3               False  
4               False  
5               False  
6               False  
7               False  
8               False  
9               False  
10              False  
11              False  
12              False  
13              False  
14              False  
15              False  
16              False  
17              False  
18              False  
19              False  
20              False  
21              False  
22              False  
23              False  
24              False  
25              False  
26              False  
27              False  
28              False  
29              False  
..                ...  
816             False  
817             False  
818             False  
819             False  
820             False  
821             False  
822             False  
823             False  
824             False  
825             False  
826             False  
827             False  
828             False  
829             False  
830             False  
831             False  
832             False  
833             False  
834             False  
835             False  
836             False  
837             False  
838             False  
839             False  
840             False  
841             False  
842             False  
843             False  
844             False  
845             False  

[846 rows x 19 columns]
Out[137]:
circularity class compactness distance_circularity elongatedness hollows_ratio max.length_aspect_ratio max.length_rectangularity pr.axis_aspect_ratio pr.axis_rectangularity radius_ratio scaled_radius_of_gyration scaled_radius_of_gyration.1 scaled_variance scaled_variance.1 scatter_ratio skewness_about skewness_about.1 skewness_about.2
0 False False False False False False False False False False False False False False False False False False False
1 False False False False False False False False False False False False False False False False False False False
2 False False False False False False False False False False False False False False False False False False False
3 False False False False False False False False False False False False False False False False False False False
4 False False False False False False True False True False False False True False False False False False False
5 False False False False False False False False False False False False False False False False False False False
6 False False False False False False False False False False False False False False False False False False False
7 False False False False False False False False False False False False False False False False False False False
8 False False False False False False False False False False False False False False False False False False False
9 False False False False False False False False False False False False False False False False False False False
10 False False False False False False False False False False False False False False False False False False False
11 False False False False False False False False False False False False False False False False False False False
12 False False False False False False False False False False False False False False False False False False False
13 False False False False False False False False False False False False False False False False False False False
14 False False False False False False False False False False False False False False False False False False False
15 False False False False False False False False False False False False False False False False False False False
16 False False False False False False False False False False False False False False False False False False False
17 False False False False False False False False False False False False False False False False False False False
18 False False False False False False False False False False False False False False False False False False False
19 False False False False False False False False False False False False False False False False False False False
20 False False False False False False False False False False False False False False False False False False False
21 False False False False False False False False False False False False False False False False False False False
22 False False False False False False False False False False False False False False False False False False False
23 False False False False False False False False False False False False False False False False False False False
24 False False False False False False False False False False False False False False False False False False False
25 False False False False False False False False False False False False False False False False False False False
26 False False False False False False False False False False False False False False False False False False False
27 False False False False False False False False False False False False False False False False False False False
28 False False False False False False False False False False False False False False False False False False False
29 False False False False False False False False False False False False False False False False False False False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
816 False False False False False False False False False False False False False False False False False False False
817 False False False False False False False False False False False False False False False False False False False
818 False False False False False False False False False False False False False False False False False False False
819 False False False False False False False False False False False False False False False False False False False
820 False False False False False False False False False False False False False False False False False False False
821 False False False False False False False False False False False False False False False False False False False
822 False False False False False False False False False False False False False False False False False False False
823 False False False False False False False False False False False False False False False False False False False
824 False False False False False False False False False False False False False False False False False False False
825 False False False False False False False False False False False False False False False False False False False
826 False False False False False False False False False False False False False False False False False False False
827 False False False False False False False False False False False False False False False False False False False
828 False False False False False False False False False False False False False False False False False False False
829 False False False False False False False False False False False False False False False False False False False
830 False False False False False False False False False False False False False False False False False False False
831 False False False False False False False False False False False False False False False False False False False
832 False False False False False False False False False False False False False False False False False False False
833 False False False False False False False False False False False False False False False False False False False
834 False False False False False False False False False False False False False False False False False False False
835 False False False False False False False False False False False False False False True False False False False
836 False False False False False False False False False False False False False False False False False False False
837 False False False False False False False False False False False False False False False False False False False
838 False False False False False False False False False False False False False False False False False False False
839 False False False False False False False False False False False False False False False False False False False
840 False False False False False False False False False False False False False False False False False False False
841 False False False False False False False False False False False False False False False False False False False
842 False False False False False False False False False False False False False False False False False False False
843 False False False False False False False False False False False False False False False False False False False
844 False False False False False False False False False False False False False False False False False False False
845 False False False False False False False False False False False False False False False False False False False

846 rows × 19 columns

In [138]:
veh1_out = veh1[~((veh < (Q1 - 1.5 * IQR)) |(veh1 > (Q3 + 1.5 * IQR))).any(axis=1)]
veh1_out.shape
C:\Users\Akash Barwad\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  """Entry point for launching an IPython kernel.
Out[138]:
(813, 19)
In [159]:
veh2=veh1_out
veh2.shape
Out[159]:
(813, 19)
In [140]:
veh1.shape
Out[140]:
(846, 19)

PCA part starts from here

In [160]:
X = veh2.iloc[:,0:17].values
y = veh2.iloc[:,18].values
from sklearn.preprocessing import StandardScaler
In [161]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 10)
In [162]:
X_train = StandardScaler().fit_transform(X_train)
X_test = StandardScaler().fit_transform(X_test)
In [163]:
cov_matrix = np.cov(X_train.T)
print('Covariance Matrix \n%s', cov_matrix)

e_vals, e_vecs = np.linalg.eig(cov_matrix)

print('Eigenvectors \n%s' %e_vecs)
print('\nEigenvalues \n%s' %e_vals)
Covariance Matrix 
%s [[ 1.00176056  0.67303392  0.79719405  0.73128119  0.14226364  0.4897036
   0.81465354 -0.79265308  0.81656737  0.67471764  0.79852657  0.82081443
   0.58457404 -0.22514708  0.20633538  0.1376959   0.28394189]
 [ 0.67303392  1.00176056  0.79669218  0.6273473   0.15330433  0.56075265
   0.84708563 -0.81675129  0.84343605  0.96151887  0.80216833  0.83752265
   0.92804391  0.0902563   0.16192697 -0.00591522 -0.14006059]
 [ 0.79719405  0.79669218  1.00176056  0.80121117  0.20216171  0.66252609
   0.91277666 -0.91517946  0.90056848  0.78322186  0.88794423  0.89737534
   0.71146759 -0.20436425  0.11929571  0.24828495  0.11139934]
 [ 0.73128119  0.6273473   0.80121117  1.00176056  0.64795189  0.4625814
   0.77366716 -0.8352769   0.74652657  0.56538905  0.77419265  0.76197567
   0.53703184 -0.39555464  0.0393724   0.14745539  0.41265989]
 [ 0.14226364  0.15330433  0.20216171  0.64795189  1.00176056  0.12795773
   0.14163715 -0.25369252  0.10942912  0.08996209  0.15394392  0.12665473
   0.09981121 -0.34707899 -0.09365954 -0.05106815  0.41185533]
 [ 0.4897036   0.56075265  0.66252609  0.4625814   0.12795773  1.00176056
   0.49350662 -0.50157516  0.49276247  0.64732382  0.40420005  0.45577812
   0.39579378 -0.30658716  0.10369878  0.13530406  0.04777365]
 [ 0.81465354  0.84708563  0.91277666  0.77366716  0.14163715  0.49350662
   1.00176056 -0.97625965  0.99167835  0.81639187  0.98011702  0.99829371
   0.80522309  0.03697039  0.08634074  0.19842877 -0.02688949]
 [-0.79265308 -0.81675129 -0.91517946 -0.8352769  -0.25369252 -0.50157516
  -0.97625965  1.00176056 -0.95216357 -0.77621348 -0.96529805 -0.96072489
  -0.76877563  0.05628881 -0.06604732 -0.17390605 -0.07940512]
 [ 0.81656737  0.84343605  0.90056848  0.74652657  0.10942912  0.49276247
   0.99167835 -0.95216357  1.00176056  0.82011071  0.96540125  0.99207234
   0.80223464  0.04842655  0.09371673  0.2002221  -0.04647376]
 [ 0.67471764  0.96151887  0.78322186  0.56538905  0.08996209  0.64732382
   0.81639187 -0.77621348  0.82011071  1.00176056  0.75608407  0.8045637
   0.87080493  0.07830698  0.1605245   0.01560081 -0.13938041]
 [ 0.79852657  0.80216833  0.88794423  0.77419265  0.15394392  0.40420005
   0.98011702 -0.96529805  0.96540125  0.75608407  1.00176056  0.97967948
   0.78872743  0.04994741  0.05986088  0.19665229 -0.00139958]
 [ 0.82081443  0.83752265  0.89737534  0.76197567  0.12665473  0.45577812
   0.99829371 -0.96072489  0.99207234  0.8045637   0.97967948  1.00176056
   0.80209956  0.04576427  0.08921677  0.19065291 -0.02210679]
 [ 0.58457404  0.92804391  0.71146759  0.53703184  0.09981121  0.39579378
   0.80522309 -0.76877563  0.80223464  0.87080493  0.78872743  0.80209956
   1.00176056  0.24308557  0.18186495 -0.04770615 -0.24973902]
 [-0.22514708  0.0902563  -0.20436425 -0.39555464 -0.34707899 -0.30658716
   0.03697039  0.05628881  0.04842655  0.07830698  0.04994741  0.04576427
   0.24308557  1.00176056 -0.07308904 -0.08195419 -0.85140082]
 [ 0.20633538  0.16192697  0.11929571  0.0393724  -0.09365954  0.10369878
   0.08634074 -0.06604732  0.09371673  0.1605245   0.05986088  0.08921677
   0.18186495 -0.07308904  1.00176056 -0.05403233  0.09956892]
 [ 0.1376959  -0.00591522  0.24828495  0.14745539 -0.05106815  0.13530406
   0.19842877 -0.17390605  0.2002221   0.01560081  0.19665229  0.19065291
  -0.04770615 -0.08195419 -0.05403233  1.00176056  0.04995322]
 [ 0.28394189 -0.14006059  0.11139934  0.41265989  0.41185533  0.04777365
  -0.02688949 -0.07940512 -0.04647376 -0.13938041 -0.00139958 -0.02210679
  -0.24973902 -0.85140082  0.09956892  0.04995322  1.00176056]]
Eigenvectors 
[[-2.71334066e-01  1.20206747e-01  4.72154231e-02  1.56869429e-01
   1.58475547e-01  2.65690158e-01  1.46818644e-01  7.71084057e-01
  -3.69533907e-01  4.21180705e-02  1.09454139e-02 -1.48200492e-03
   6.14976978e-02  1.70502068e-03 -1.74269446e-01 -1.39775092e-02
   2.13875903e-02]
 [-2.89453073e-01 -1.32335559e-01  1.88297668e-01 -8.61928654e-02
  -1.41514553e-01 -7.56203706e-02 -4.05039921e-01  4.55234192e-02
  -1.71499347e-02 -2.37742713e-01  4.46787925e-02  5.64420152e-04
   3.97210550e-01  7.36260068e-02  7.77818481e-02  5.28581121e-01
   3.95584643e-01]
 [-3.03078431e-01  6.95116758e-02 -5.16670127e-02  1.10984545e-01
  -1.03423332e-01  1.70823710e-02  1.37979823e-01 -2.73653175e-01
  -3.10049992e-01  4.29663738e-02  7.92187377e-01 -3.18141636e-03
  -1.67763764e-01 -1.24169779e-02  1.19153679e-01 -4.25671379e-02
   1.16427516e-01]
 [-2.63706299e-01  2.91968978e-01 -6.67524803e-02 -2.09077432e-01
   8.68730517e-02 -1.42276142e-01  1.55358226e-01 -5.71601300e-02
  -1.27533366e-01 -8.36735789e-02 -1.71654390e-01 -2.20712094e-02
   4.74830955e-01  2.65488997e-03  5.71407945e-01 -3.59815169e-01
  -8.47766265e-02]
 [-7.37773033e-02  3.67938167e-01  2.29725370e-03 -5.67716703e-01
  -3.07225177e-03 -5.73252255e-01  1.03782695e-01  1.80855176e-01
  -2.31535187e-03 -4.28430629e-02  6.42184336e-02  1.53979490e-02
  -2.73637481e-01 -5.65001751e-03 -2.42000579e-01  1.58376247e-01
  -2.26949031e-03]
 [-1.91234203e-01  9.68387083e-02  1.60851706e-01  2.29558968e-01
  -7.37216108e-01 -8.70996667e-02  3.66572597e-01  6.63124514e-02
   1.80697116e-01  3.06358154e-01 -1.95832637e-01 -1.41529397e-02
  -1.71217086e-02 -8.78861418e-03  7.85951571e-02  1.17677014e-01
   1.34698237e-02]
 [-3.14059300e-01 -5.54204246e-02 -1.02201955e-01  7.48961011e-03
   1.04371760e-01  6.87282573e-02  1.04975265e-01 -9.83830054e-02
   1.38874968e-01 -9.46739107e-02 -1.33719376e-01  7.90684389e-01
  -8.83247139e-02 -3.90127797e-01 -6.87734039e-02 -1.09386405e-02
   1.23755686e-01]
 [ 3.10312677e-01 -1.98298916e-02  9.58321540e-02  5.41173141e-02
  -9.63851994e-02 -4.36229356e-02 -1.21773639e-01  2.17406591e-01
  -2.68289765e-01 -1.65361829e-01 -2.08532507e-02  2.16251414e-01
  -3.80946143e-01 -1.01813325e-01  6.24411197e-01  2.82983096e-01
  -2.14323735e-01]
 [-3.11202402e-01 -6.97107118e-02 -9.59237500e-02  3.04579514e-02
   9.79151902e-02  8.30312398e-02  9.55092317e-02 -4.64223785e-02
   9.31880281e-02 -2.26122244e-01 -2.41548706e-01 -6.39116137e-03
  -4.49147565e-01  6.76425368e-01  1.44240292e-01 -6.88872185e-02
   2.39516975e-01]
 [-2.81592469e-01 -1.34679728e-01  2.00715657e-01 -4.24900544e-03
  -2.56065087e-01 -3.70767252e-02 -3.58095026e-01  2.09838319e-01
   3.13223856e-01 -3.60480787e-01  1.95863032e-01 -1.58020767e-02
  -8.37822979e-02 -3.01827069e-02 -5.76030199e-02 -3.91511099e-01
  -4.38014739e-01]
 [-3.05708718e-01 -4.77536591e-02 -1.41685198e-01 -2.85197784e-02
   1.96085454e-01  9.20427282e-02  8.71606302e-02 -1.40979826e-01
   6.09093940e-02  1.74861390e-01  1.36289234e-02  3.02141048e-02
   1.08659709e-01  2.00403100e-01 -2.30730579e-02  4.98698066e-01
  -6.87309593e-01]
 [-3.11320875e-01 -6.02537289e-02 -1.03359238e-01  7.38624496e-03
   1.46124477e-01  9.57476906e-02  7.96132592e-02 -6.27871751e-02
   1.02687102e-01 -1.55915806e-01 -1.83821724e-01 -5.70095660e-01
  -2.86753869e-01 -5.75915444e-01  1.17926950e-01  1.46366254e-01
   7.08557316e-02]
 [-2.69016134e-01 -2.16682502e-01  1.81864622e-01 -1.39812035e-01
   1.70075325e-02 -1.11930092e-01 -4.37269108e-01 -1.12415272e-01
  -3.60805608e-01  5.89248604e-01 -2.36427094e-01  8.10031788e-03
  -1.97008163e-01 -3.57841834e-02  9.04672147e-03 -1.94389450e-01
  -8.83089314e-03]
 [ 2.06404871e-02 -5.66522966e-01 -1.11528140e-01 -1.75349911e-01
   1.73505003e-01 -1.99603915e-01  2.04703903e-01  3.45098649e-01
   3.71454709e-01  3.08049985e-01  2.62069939e-01 -3.99176656e-03
   5.64946942e-02  9.02679894e-03  2.84166017e-01 -3.15759970e-02
   1.33815633e-01]
 [-4.35973104e-02  8.89043886e-03  6.14681090e-01  4.63293973e-01
   4.12625798e-01 -4.38418103e-01  1.69523810e-01 -9.13245419e-02
   4.89388310e-02 -4.95654928e-02 -9.74496146e-03 -2.38130513e-03
   9.92048861e-03  3.59376434e-03 -1.36512185e-02  1.27234617e-02
  -2.41899815e-02]
 [-5.30383731e-02  7.11267981e-02 -6.32336972e-01  5.11756978e-01
  -1.45407103e-02 -4.85843815e-01 -2.78261056e-01  1.11102508e-01
  -1.73627773e-02  2.20742869e-02 -4.26012907e-02 -9.46194076e-03
   3.17231746e-02 -2.24330447e-03 -2.40955338e-02 -7.81677734e-03
   2.79377806e-03]
 [-1.50704330e-02  5.72531420e-01  7.89911669e-02  7.28168687e-02
   1.92644530e-01  2.31930785e-01 -3.31138328e-01  9.34557057e-02
   4.84694543e-01  3.46181931e-01  1.51311729e-01  2.69646176e-02
  -1.04136875e-01  7.61882691e-03  2.08214508e-01  5.33027921e-02
   1.25133232e-01]]

Eigenvalues 
[9.70498964e+00 2.58145261e+00 1.19948501e+00 1.14331327e+00
 8.64368272e-01 6.55185311e-01 3.17786946e-01 2.17864457e-01
 1.00304855e-01 7.41329330e-02 5.93348695e-02 3.35072789e-04
 3.57053557e-02 8.35195971e-03 2.69815257e-02 1.85471577e-02
 2.17903391e-02]
In [165]:
tot = sum(e_vals)
var_exp = [( i /tot ) * 100 for i in sorted(e_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)
print("Cumulative Variance Explained", cum_var_exp)
Cumulative Variance Explained [ 56.98784364  72.14617179  79.1895656   85.90311813  90.97870149
  94.82595939  96.69200907  97.97131241  98.56030401  98.9956137
  99.34402893  99.55369131  99.71212723  99.84008043  99.94898962
  99.99803245 100.        ]
In [166]:
# Ploting 
plt.figure(figsize=(10 , 5))
plt.bar(range(1, e_vals.size + 1), var_exp, alpha = 0.5, align = 'center', label = 'Individual explained variance')
plt.step(range(1, e_vals.size + 1), cum_var_exp, where='mid', label = 'Cumulative explained variance')
plt.ylabel('Explained Variance Ratio')
plt.xlabel('Principal Components')
plt.legend(loc = 'best')
plt.tight_layout()
plt.show()
In [175]:
eigen_pairs = [(np.abs(e_vals[i]), e_vecs[:,i]) for i in range(len(e_vals))]
eigen_pairs.sort(reverse=True)
eigen_pairs[:7]
Out[175]:
[(9.70498963897293,
  array([-0.27133407, -0.28945307, -0.30307843, -0.2637063 , -0.0737773 ,
         -0.1912342 , -0.3140593 ,  0.31031268, -0.3112024 , -0.28159247,
         -0.30570872, -0.31132088, -0.26901613,  0.02064049, -0.04359731,
         -0.05303837, -0.01507043])),
 (2.5814526100817123,
  array([ 0.12020675, -0.13233556,  0.06951168,  0.29196898,  0.36793817,
          0.09683871, -0.05542042, -0.01982989, -0.06971071, -0.13467973,
         -0.04775366, -0.06025373, -0.2166825 , -0.56652297,  0.00889044,
          0.0711268 ,  0.57253142])),
 (1.1994850051042234,
  array([ 0.04721542,  0.18829767, -0.05166701, -0.06675248,  0.00229725,
          0.16085171, -0.10220196,  0.09583215, -0.09592375,  0.20071566,
         -0.1416852 , -0.10335924,  0.18186462, -0.11152814,  0.61468109,
         -0.63233697,  0.07899117])),
 (1.1433132684759448,
  array([ 0.15686943, -0.08619287,  0.11098454, -0.20907743, -0.5677167 ,
          0.22955897,  0.00748961,  0.05411731,  0.03045795, -0.00424901,
         -0.02851978,  0.00738624, -0.13981203, -0.17534991,  0.46329397,
          0.51175698,  0.07281687])),
 (0.8643682716555754,
  array([ 0.15847555, -0.14151455, -0.10342333,  0.08687305, -0.00307225,
         -0.73721611,  0.10437176, -0.0963852 ,  0.09791519, -0.25606509,
          0.19608545,  0.14612448,  0.01700753,  0.173505  ,  0.4126258 ,
         -0.01454071,  0.19264453])),
 (0.6551853108346146,
  array([ 0.26569016, -0.07562037,  0.01708237, -0.14227614, -0.57325226,
         -0.08709967,  0.06872826, -0.04362294,  0.08303124, -0.03707673,
          0.09204273,  0.09574769, -0.11193009, -0.19960392, -0.4384181 ,
         -0.48584381,  0.23193079])),
 (0.3177869464997268,
  array([ 0.14681864, -0.40503992,  0.13797982,  0.15535823,  0.10378269,
          0.3665726 ,  0.10497526, -0.12177364,  0.09550923, -0.35809503,
          0.08716063,  0.07961326, -0.43726911,  0.2047039 ,  0.16952381,
         -0.27826106, -0.33113833]))]
In [178]:
w = np.hstack((eigen_pairs[0][1].reshape(17,1), 
                      eigen_pairs[1][1].reshape(17,1)))
print('Matrix W:\n', w)
X_pca = X_train.dot(w)
Matrix W:
 [[-0.27133407  0.12020675]
 [-0.28945307 -0.13233556]
 [-0.30307843  0.06951168]
 [-0.2637063   0.29196898]
 [-0.0737773   0.36793817]
 [-0.1912342   0.09683871]
 [-0.3140593  -0.05542042]
 [ 0.31031268 -0.01982989]
 [-0.3112024  -0.06971071]
 [-0.28159247 -0.13467973]
 [-0.30570872 -0.04775366]
 [-0.31132088 -0.06025373]
 [-0.26901613 -0.2166825 ]
 [ 0.02064049 -0.56652297]
 [-0.04359731  0.00889044]
 [-0.05303837  0.0711268 ]
 [-0.01507043  0.57253142]]
In [180]:
X_train.shape, w.shape, X_pca.shape
Out[180]:
((569, 17), (17, 2), (569, 2))
In [181]:
X_pca
Out[181]:
array([[-3.17335885, -0.65924239],
       [-1.61895414,  0.70052913],
       [ 5.05334739, -1.95490136],
       ...,
       [ 0.08009224,  0.78708036],
       [ 2.45561652,  1.15162265],
       [ 1.71599444,  2.68719083]])

Confirmed the code with SVM.

In [182]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
In [183]:
clf = SVC()
clf.fit(X_train, y_train)
print ('score', clf.score(X_test, y_test))
score 0.9672131147540983
In [ ]:
 
In [ ]: